rm(list=ls())
set.seed(123)
# Data Manipulation
library(tidyverse) # Comprehensive data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr) # Data manipulation (part of tidyverse)
library(stringr) # String manipulation
library(reshape2) # Data reshaping and melting
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
# Visualization
library(ggplot2) # Data visualization
library(corrplot) # Correlation matrix visualization
## corrplot 0.92 loaded
library(ggrepel) # Improved text label placement
library(hrbrthemes) # Ggplot themes
library(viridis) # Color scales for ggplot2
## Loading required package: viridisLite
# Statistical Analysis and Clustering
library(factoextra) # PCA and clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster) # Clustering algorithms
library(mclust) # Model-based clustering
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
##
## The following object is masked from 'package:purrr':
##
## map
library(nFactors) # Determining number of factors
## Loading required package: lattice
##
## Attaching package: 'nFactors'
##
## The following object is masked from 'package:lattice':
##
## parallel
library(insight) # Extract and interpret model information
## Warning: package 'insight' was built under R version 4.4.1
library(caret) # Machine learning
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
pitching_df <- read.csv("MLB_pitching_stats_ALL_PITCHING.csv")
head(pitching_df)
dim(pitching_df)
## [1] 4233 108
# Count the number of 0 values in each column
colSums(pitching_df == 0)
## X year
## 1 0
## playerId playerName
## 0 0
## type rank
## 0 0
## playerFullName playerFirstName
## 0 0
## playerLastName playerUseName
## 0 0
## playerInitLastName teamId
## 0 0
## teamAbbrev teamName
## 0 0
## teamShortName leagueName
## 0 0
## leagueId positionAbbrev
## 0 0
## position primaryPositionAbbrev
## 0 0
## winningPercentage runsScoredPer9
## 0 0
## battersFaced babip
## 0 0
## obp slg
## 49 86
## ops strikeoutsPer9
## 49 0
## baseOnBallsPer9 homeRunsPer9
## 0 0
## hitsPer9 strikesoutsToWalks
## 0 0
## inheritedRunners inheritedRunnersScored
## 1651 2168
## bequeathedRunners bequeathedRunnersScored
## 809 1565
## stolenBases caughtStealing
## 1439 2411
## qualityStarts gamesFinished
## 3192 1238
## doubles triples
## 535 2550
## gidp gidpOpp
## 1124 191
## wildPitches balks
## 1684 3636
## pickoffs totalSwings
## 3455 1
## swingAndMisses ballsInPlay
## 166 5
## runSupport strikePercentage
## 473 0
## pitchesPerInning pitchesPerPlateAppearance
## 0 0
## walksPerPlateAppearance strikeoutsPerPlateAppearance
## 266 274
## homeRunsPerPlateAppearance walksPerStrikeout
## 629 0
## iso flyOuts
## 264 189
## popOuts lineOuts
## 582 484
## groundOuts flyHits
## 126 485
## popHits lineHits
## 3682 246
## groundHits gamesPlayed
## 384 0
## gamesStarted airOuts
## 2423 75
## runs homeRuns
## 225 629
## strikeOuts baseOnBalls
## 274 266
## intentionalWalks hits
## 2837 86
## hitByPitch avg
## 1377 86
## atBats stolenBasePercentage
## 0 0
## groundIntoDoublePlay numberOfPitches
## 1124 0
## era inningsPitched
## 0 5
## wins losses
## 1597 1472
## saves saveOpportunities
## 3245 2677
## holds blownSaves
## 2493 2935
## earnedRuns whip
## 244 0
## outs gamesPitched
## 5 0
## completeGames shutouts
## 4092 4147
## strikes hitBatsmen
## 0 1377
## totalBases groundOutsToAirouts
## 86 0
## winPercentage strikeoutWalkRatio
## 0 0
## strikeoutsPer9Inn walksPer9Inn
## 0 0
## hitsPer9Inn catchersInterference
## 0 3908
## sacBunts sacFlies
## 2890 1842
par(mar = c(5, 10, 4, 2)) # Increase left margin to fit labels
barplot(colSums(pitching_df == 0)[colSums(pitching_df == 0) >= 2], las=2, horiz = T)
# Plot games played
ggplot(pitching_df, aes(x = gamesPlayed)) +
geom_histogram(binwidth = 1, fill = "darkblue", color = "black")+
geom_vline(xintercept = 5, color = "red", linetype = "dashed")
# Correct pitching_df types for some character columns
pitching_df <- pitching_df %>%
mutate(across(c("winPercentage", "runsScoredPer9", "babip", "strikeoutsPer9", "baseOnBallsPer9", "homeRunsPer9", "hitsPer9", "strikesoutsToWalks", "pitchesPerInning", "walksPerStrikeout", "stolenBasePercentage", "era", "whip", "groundOutsToAirouts", "winPercentage", "winPercentage", "strikeoutWalkRatio", "strikeoutsPer9Inn", "walksPer9Inn", "hitsPer9Inn"), as.numeric))
## Warning: There were 18 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 17 remaining warnings.
pitching_df <- pitching_df %>% filter(year == 2024)
pitching_df <- pitching_df %>% filter(gamesPlayed > 5)
pitching_df$gidpOpp <- ifelse(pitching_df$battersFaced > 0, pitching_df$gidpOpp / pitching_df$battersFaced, NA)
pitching_df$swingAndMisses <- ifelse(pitching_df$battersFaced > 0, pitching_df$swingAndMisses / pitching_df$battersFaced, NA)
pitching_df$ballsInPlay <- ifelse(pitching_df$battersFaced > 0, pitching_df$ballsInPlay / pitching_df$battersFaced, NA)
# Define the columns to mark for deletion
columns_to_delete <- c("X",
"playerId", "year", "playerName", "type", "rank", "playerFirstName",
"playerLastName", "playerUseName", "playerInitLastName", "teamId",
"runSupport", "teamAbbrev", "teamName", "leagueName", "leagueId",
"position", "primaryPositionAbbrev", "ops",
"strikesoutsToWalks", "stolenBases", "caughtStealing", "doubles",
"triples", "gidp", "wildPitches", "balks", "pickoffs", "totalSwings",
"pitchesPerInning", "walksPerStrikeout","lineOuts", "airOuts",
"catchersInterference", "sacBunts", "sacFlies",
"groundOuts", "strikeoutWalkRatio", "popOuts", 'lineOuts', 'groundOuts',
"winningPercentage",
"wins","losses", "atBats", "numberOfPitches",
"gamesPitched", "intentionalWalks",
"strikeoutsPer9Inn", "baseOnBallsPer9",
"hitsPer9Inn","gidp",
"hitBatsmen", "groundIntoDoublePlay", "gamesPlayed","stolenBasePercentage"
)
# Added : groundintodoubleplay, gamesplayed
# pitching_df Kept: "popHits", "lineHits","groundHits", "flyOuts","wins", "losses"
# Remove selected columns
pitching_df <- pitching_df %>%
select(-one_of(columns_to_delete))
# Only keep numerical pitching_df
pitching_df.num <- pitching_df %>%
select_if(is.numeric)
corr_matrix <- cor(pitching_df.num, use = "complete.obs")
corrplot(corr_matrix,
method = "color",
col = colorRampPalette(c("blue", "white", "red"))(200),
type = "upper",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 0.35,
tl.cex = 0.5,
number.digits = 3 # Set precision for display
)
Find and print pairs with correlation 1
# Find and print pairs with correlation 1
correlated_columns <- which(corr_matrix == 1, arr.ind = TRUE)
print("Pairs with correlation 1:")
## [1] "Pairs with correlation 1:"
for (i in 1:nrow(correlated_columns)) {
row <- correlated_columns[i, 1]
col <- correlated_columns[i, 2]
# Only print pairs where the row index is less than the column index to avoid duplicates
if (row < col) {
print(paste(colnames(cor_matrix)[row], colnames(cor_matrix)[col]))
}
}
# Replace NA values with the mean of the column
pitching_df.num <- pitching_df.num %>%
mutate(across(everything(), ~ifelse(is.na(.), mean(., na.rm = TRUE), .)))
Show the distributions of all variables
data_long <- pitching_df.num %>%
pivot_longer(everything(), names_to = "variable", values_to = "value")
ggplot(data_long, aes(x = value)) +
geom_density(fill = "blue", alpha = 0.4) +
facet_wrap(~ variable, scales = "free") +
labs(title = "Distributions of All Variables", x="", y="") +
theme_minimal() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
strip.text = element_text(size = 7)
)
ggsave("distributions_all.png", plot = last_plot(), width = 8, height = 6, dpi = 300)
# Box plot of all variables
scaled_data <- scale(sqrt(pitching_df.num))
boxplot(scaled_data, las=2)
ERA histrogram
p <- ggplot(pitching_df.num, aes(x = pitching_df$era)) +
geom_histogram(bins = 50, fill = "darkblue", color = "black", alpha = 0.7) +
# geom_boxplot(fill = "darkblue", color = "black", alpha = 0.7) +
labs(title = paste("Histogram of era"), x = "Era", y = "Frequency") +
theme_minimal()
# Print the plot to display it
print(p)
ggsave("histogram_era.png", plot = last_plot(), width = 8, height = 6, dpi = 300)
cor_matrix <- cor(pitching_df[, c("babip", "obp", "slg", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "era", "whip")])
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex=0.7,col = colorRampPalette(c("blue", "white", "red"))(200))
# Correlations for starting pitchers
starter_pitching_df <- pitching_df[pitching_df$gamesStarted > 4, ]
cor_matrix_starters <- cor(starter_pitching_df[, c("era", "whip", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "baseOnBalls", "inningsPitched", "gamesStarted")])
corrplot(cor_matrix_starters, method = "color", tl.cex = 0.8, addCoef.col = "black", number.cex=0.7, col = colorRampPalette(c("blue", "white", "red"))(200))
# Correlations for relief pitchers
reliever_pitching_df <- pitching_df[pitching_df$gamesFinished > 4, ]
cor_matrix_relievers <- cor(reliever_pitching_df[, c("era", "whip", "strikeoutsPer9", "homeRunsPer9", "hitsPer9", "baseOnBalls", "gamesFinished", "saves", "saveOpportunities", "holds", "blownSaves")])
corrplot(cor_matrix_relievers, method = "color", tl.cex = 0.8, addCoef.col = "black", number.cex=0.7, col = colorRampPalette(c("blue", "white", "red"))(200))
ggplot_era <- ggplot(pitching_df, aes(x = era, y = whip)) +
geom_point(aes(color = gamesStarted)) +
labs(title = "ERA vs WHIP", x = "ERA", y = "WHIP") +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")
ggplot_era # Lower whip and era values correlate to lower values with the direct other variable
ggsave("era_whip.png", plot = last_plot(), width = 8, height = 6, dpi = 300)
ggplot_strikouts <- ggplot(pitching_df, aes(x = strikeoutsPer9, y = walksPer9Inn)) +
geom_point(aes(color = battersFaced)) +
labs(title = "Strikeouts per 9 vs Home Runs per 9", x = "Strikeouts per 9", y = "Home Runs per 9") +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")
ggplot_strikouts# Values seem uncorrelated
ggsave("strikeouts_walks.png", plot = last_plot(), width = 8, height = 6, dpi = 300)
# Plot data for Starting pitchers
ggplot(starter_pitching_df, aes(x = battersFaced, y = era)) +
geom_point(aes(color = battersFaced)) +
labs(title = "Strikeouts Over Time", x = "Batters Faced", y = "Strike Outs") +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")
# plot data for Relievers
ggplot(reliever_pitching_df, aes(x = battersFaced, y = era)) +
geom_point(aes(color = era)) +
labs(title = "Strikeouts Over Time", x = "Batters Faced", y = "Strike Outs") +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")
# Assuming you have two previously created data frames : starter_pitching_df and reliever_pitching_df
# Add a new column to label the data as "Starter" or "Reliever"
starter_pitching_df$Position <- "Starter"
reliever_pitching_df$Position <- "Reliever"
# Combine the two data frames into one
combined_pitching_df <- rbind(starter_pitching_df, reliever_pitching_df)
# Now, plot both data sets with two boxplots side by side
box_plot_era <-ggplot(combined_pitching_df, aes(x = Position, y = era, fill = Position)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
theme_ipsum() +
theme(
legend.position = "none",
plot.title = element_text(size = 11),
axis.text.x = element_text(angle = 0, hjust = 0.5) # Adjust x-axis labels for readability
) +
coord_flip() +
ggtitle("Boxplot of ERA for Starters vs Relievers") +
xlab("Pitcher Type (Starter vs Reliever)")
box_plot_era
ggsave("boxplot_era.png", plot = last_plot(), width = 10, height = 6, dpi = 300)
pca <- prcomp(pitching_df.num, scale = T)
data.frame(z1=-pca$x[,1],z2=pca$x[,2]) %>%
ggplot(aes(z1,z2,label=pitching_df$playerFullName, color=as.numeric(pitching_df$battersFaced)))+
geom_point(size=0) +
labs(title="PCA", x="PC1", y="PC2", color="Batters Faced") +
theme_bw() +
ylim(-5,5) +
geom_text(size=2, hjust=0.6, vjust=0, check_overlap = TRUE)+
scale_color_gradient(low = "blue", high = "red")
## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_text()`).
# Dimension 1 is starting pitchers
ggplot(data.frame(variable = names(pca$rotation[, 1]), loading = pca$rotation[, 1]),
aes(x = reorder(variable, loading), y = loading)) +
geom_bar(stat = "identity", fill = "darkblue") +
coord_flip() +
theme_minimal() +
labs(x = "Variable", y = "Loading on PC1", title = "Loadings for Principal Component 1") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Optional: rotate axis labels
# Dimension 2 is relief pitchers/closers
ggplot(data.frame(variable = names(pca$rotation[, 2]), loading = pca$rotation[, 2]),
aes(x = reorder(variable, loading), y = loading)) +
geom_bar(stat = "identity", fill = "darkblue") +
coord_flip() +
theme_minimal() +
labs(x = "Variable", y = "Loading on PC2", title = "Loadings for Principal Component 2") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Optional: rotate axis labels
fviz_contrib(pca, choice = "var", axes = 1)
fviz_contrib(pca, choice = "var", axes = 2)
fviz_pca_var(pca, col.var = "contrib")
fviz_pca_biplot(pca, repel = TRUE, col.var = "contrib", geom = "point", pointshape = 21, pointsize = 2, label = "all")
# Calculate proportion of variance explained
var_explained = pca$sdev^2 / sum(pca$sdev^2)
# Create a data frame with the results
summary_df = data.frame(
PC = paste0("PC", 1:length(var_explained)),
Proportion = var_explained,
Cumulative = cumsum(var_explained)
)
# Plot the proportion of variance explained and the cummulative proportion
ggplot(head(summary_df, 9), aes(x = PC, y = Proportion)) +
geom_col(fill = "steelblue") +
geom_line(aes(y = Proportion), group = 1, color = "black") +
geom_point(aes(y = Proportion), color = "black") +
geom_line(aes(y = Cumulative), group = 1, color = "red") +
geom_text(aes(y = Cumulative, label = scales::percent(Cumulative, accuracy = 0.1)),
vjust = -0.5, size = 3) + # Adds text for line peaks
geom_point(aes(y = Cumulative), color = "red") +
geom_text(aes(label = scales::percent(Proportion, accuracy = 0.1)),
vjust = -0.5, size = 3) + # Adds text for bar peaks
scale_y_continuous(labels = scales::percent,
sec.axis = sec_axis(~., name = "Cumulative Proportion")) +
labs(title = "Variance Explained by Principal Components",
x = "Principal Component",
y = "Proportion of Variance Explained") +
theme_minimal()
Print the top 10 players
pitching_df$playerFullName[order(pca$x[,1])][(nrow(pitching_df)-5):nrow(pitching_df)]
## [1] "Nick Burdi" "Zach Penrod" "Riley O'Brien" "Will Klein"
## [5] "Amir Garrett" "Jairo Iriarte"
pitching_df$playerFullName[order(pca$x[,1])][1:10]
## [1] "Aaron Nola" "Kutter Crawford" "Seth Lugo" "Luis Severino"
## [5] "Logan Webb" "José BerrÃos" "Kevin Gausman" "Miles Mikolas"
## [9] "Patrick Corbin" "JP Sears"
We can use Kaiser’s Criterion that says the loadings that could be kept have a eigen value > 1 such that the component explains more variance than a single original variable in the data set
eigenvalues <- pca$sdev^2
components_to_keep <- sum(eigenvalues > 1)
components_to_keep
## [1] 8
8 eigen values can be kept as they have an eigen value > 1
Direct Factor Analysis did not work so we had to increase the tolerance for factor analysis, this is likely because this is a very noisy data set. We did it by removing the highly correlated variables
ev <- eigen(cor(pitching_df.num))
nS <- nScree(x=ev$values)
plotnScree(nS,legend = T)
print(ev$values)
## [1] 1.718456e+01 9.832768e+00 4.305317e+00 3.596905e+00 3.048949e+00
## [6] 2.152778e+00 2.036800e+00 1.572300e+00 9.647913e-01 8.749767e-01
## [11] 8.506455e-01 7.823639e-01 6.557721e-01 5.964013e-01 5.172547e-01
## [16] 4.599051e-01 3.602745e-01 3.038632e-01 2.748158e-01 2.286877e-01
## [21] 2.241795e-01 1.678282e-01 1.498028e-01 1.301776e-01 1.237122e-01
## [26] 1.082414e-01 9.206531e-02 7.848734e-02 6.691190e-02 5.878771e-02
## [31] 4.021497e-02 3.058856e-02 2.748036e-02 2.130271e-02 1.919845e-02
## [36] 1.895951e-02 1.533066e-02 6.572728e-03 6.351744e-03 4.795298e-03
## [41] 2.620511e-03 1.944970e-03 1.854508e-03 8.882215e-04 7.895135e-04
## [46] 6.436148e-04 7.529294e-05 4.501264e-05 1.688850e-05 6.487918e-06
## [51] 1.426479e-14 8.167849e-16
nzv <- nearZeroVar(pitching_df.num, saveMetrics = TRUE)
nzv
pitching_df_reduced <- pitching_df.num[, !nzv$nzv]
# Scale the pitching_df
pitching_df_scaled <- scale(pitching_df_reduced)
# Remove highly correlated variables with a cutoff of 0.95
cor_matrix <- cor(pitching_df_scaled)
high_cor <- findCorrelation(cor_matrix, cutoff = 0.95)
pitching_df_reduced <- pitching_df_scaled[, -high_cor]
fit <- factanal(pitching_df_reduced, factors = 3, rotation = "varimax", n.obs = nrow(pitching_df.num) * 1.5)
fit
##
## Call:
## factanal(x = pitching_df_reduced, factors = 3, n.obs = nrow(pitching_df.num) * 1.5, rotation = "varimax")
##
## Uniquenesses:
## babip obp slg
## 0.643 0.472 0.005
## strikeoutsPer9 homeRunsPer9 hitsPer9
## 0.890 0.363 0.261
## inheritedRunners inheritedRunnersScored bequeathedRunners
## 0.102 0.194 0.422
## bequeathedRunnersScored qualityStarts gamesFinished
## 0.629 0.175 0.729
## gidpOpp swingAndMisses ballsInPlay
## 0.858 0.853 0.779
## strikePercentage pitchesPerPlateAppearance iso
## 0.984 0.949 0.203
## flyOuts flyHits popHits
## 0.110 0.120 0.908
## lineHits groundHits gamesStarted
## 0.061 0.142 0.038
## runs baseOnBalls hitByPitch
## 0.044 0.223 0.642
## era saveOpportunities holds
## 0.316 0.845 0.516
## blownSaves groundOutsToAirouts winPercentage
## 0.577 0.980 0.885
## walksPer9Inn
## 0.928
##
## Loadings:
## Factor1 Factor2 Factor3
## babip 0.596
## obp -0.168 0.707
## slg 0.991 -0.117
## strikeoutsPer9 -0.287 0.163
## homeRunsPer9 0.789 -0.117
## hitsPer9 0.853
## inheritedRunners -0.108 0.938
## inheritedRunnersScored 0.895
## bequeathedRunners 0.629 0.427
## bequeathedRunnersScored 0.502 0.344
## qualityStarts 0.801 -0.421
## gamesFinished -0.168 -0.176 0.460
## gidpOpp 0.357
## swingAndMisses -0.358 0.133
## ballsInPlay 0.121 0.421 -0.169
## strikePercentage 0.115
## pitchesPerPlateAppearance -0.219
## iso 0.883 -0.133
## flyOuts 0.923 -0.192
## flyHits 0.909 0.156 -0.174
## popHits 0.304
## lineHits 0.951 -0.177
## groundHits 0.920 -0.103
## gamesStarted 0.862 -0.468
## runs 0.956 0.152 -0.139
## baseOnBalls 0.877
## hitByPitch 0.596
## era -0.146 0.808
## saveOpportunities -0.210 0.322
## holds -0.193 0.665
## blownSaves -0.127 0.634
## groundOutsToAirouts 0.140
## winPercentage -0.331
## walksPer9Inn -0.260
##
## Factor1 Factor2 Factor3
## SS loadings 7.807 5.501 3.843
## Proportion Var 0.230 0.162 0.113
## Cumulative Var 0.230 0.391 0.504
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 16769.81 on 462 degrees of freedom.
## The p-value is 0
We used the first 10 components from the PCA to perform factor analysis
fa_pca <- prcomp(pitching_df.num, scale = T)
summary(fa_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 4.1454 3.1357 2.07493 1.89655 1.74612 1.4672 1.42717
## Proportion of Variance 0.3305 0.1891 0.08279 0.06917 0.05863 0.0414 0.03917
## Cumulative Proportion 0.3305 0.5196 0.60236 0.67153 0.73016 0.7716 0.81073
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.25391 0.98224 0.93540 0.92230 0.88451 0.80980 0.77227
## Proportion of Variance 0.03024 0.01855 0.01683 0.01636 0.01505 0.01261 0.01147
## Cumulative Proportion 0.84097 0.85952 0.87635 0.89271 0.90775 0.92036 0.93183
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.71920 0.67816 0.60023 0.55124 0.52423 0.4782 0.47348
## Proportion of Variance 0.00995 0.00884 0.00693 0.00584 0.00528 0.0044 0.00431
## Cumulative Proportion 0.94178 0.95062 0.95755 0.96340 0.96868 0.9731 0.97739
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.40967 0.38704 0.3608 0.35173 0.32900 0.30342 0.28016
## Proportion of Variance 0.00323 0.00288 0.0025 0.00238 0.00208 0.00177 0.00151
## Cumulative Proportion 0.98062 0.98350 0.9860 0.98838 0.99046 0.99223 0.99374
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.25867 0.24246 0.20054 0.17490 0.16577 0.14595 0.13856
## Proportion of Variance 0.00129 0.00113 0.00077 0.00059 0.00053 0.00041 0.00037
## Cumulative Proportion 0.99503 0.99616 0.99693 0.99752 0.99805 0.99846 0.99883
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.13769 0.12382 0.08107 0.07970 0.06925 0.05119 0.04410
## Proportion of Variance 0.00036 0.00029 0.00013 0.00012 0.00009 0.00005 0.00004
## Cumulative Proportion 0.99919 0.99949 0.99961 0.99974 0.99983 0.99988 0.99992
## PC43 PC44 PC45 PC46 PC47 PC48
## Standard deviation 0.04306 0.02980 0.02810 0.02537 0.008677 0.006709
## Proportion of Variance 0.00004 0.00002 0.00002 0.00001 0.000000 0.000000
## Cumulative Proportion 0.99995 0.99997 0.99998 1.00000 1.000000 1.000000
## PC49 PC50 PC51 PC52
## Standard deviation 0.00411 0.002547 9.903e-16 3.026e-16
## Proportion of Variance 0.00000 0.000000 0.000e+00 0.000e+00
## Cumulative Proportion 1.00000 1.000000 1.000e+00 1.000e+00
# Pick the first 10 components
compnets_for_fa <- fa_pca$x[, 1:10]
fa <- factanal(compnets_for_fa, factors = 5, rotation = "varimax")
fa
##
## Call:
## factanal(x = compnets_for_fa, factors = 5, rotation = "varimax")
##
## Uniquenesses:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## 1.00 0.75 0.75 1.00 1.00 0.75 0.75 1.00 1.00 0.75
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5
## PC1
## PC2 0.490
## PC3 0.492
## PC4
## PC5
## PC6 0.492
## PC7 0.486
## PC8
## PC9
## PC10 0.499
##
## Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings 0.250 0.250 0.250 0.250 0.250
## Proportion Var 0.025 0.025 0.025 0.025 0.025
## Cumulative Var 0.025 0.050 0.075 0.100 0.125
##
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 0 on 5 degrees of freedom.
## The p-value is 1
# Scree plot to determine number of factors using Kaiser rule
eigenvalues <- fa_pca$sdev^2
plot(eigenvalues, type = "b",
# Smaller circle
pch = 0,
xlab = "Principal Component",
ylab = "Eigenvalue",
main = "Scree Plot")
abline(h = 1, col = "red", lty = 2) # Kaiser rule threshold line at eigenvalue = 1
loadings_df <- data.frame(Variable = rownames(fa$loadings))
for(i in 1:ncol(fa$loadings)) {
loadings_df[paste0("Factor ", i)] <- fa$loadings[,i]
}
loadings_long <- melt(loadings_df,
id.vars = "Variable",
variable.name = "Factor",
value.name = "Loading")
ggplot(loadings_long, aes(x = Variable, y = Loading)) +
geom_col(fill = "lightgreen", width = 0.7) +
facet_grid(Factor~.) +
geom_text(aes(label = round(Loading, 2)),
color = "black",
vjust = 1.3,
size = 2.5) +
labs(title = "Factor Loadings",
x = "Component",
y = "Loading") +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.text.y = element_blank()
)
# First how much centers should we have
fviz_nbclust(pitching_df.num, kmeans, method = 'silhouette', k.max = 10) # -> shows 2 clusters is optimal
k = 3 # -> I chose 3 anyway
fit = kmeans(scale(pitching_df.num), k, nstart=100)
groups = fit$cluster
# How many players are in each cluster
barplot(table(groups), col="darkblue")
# Create unique labels by appending an index to each duplicate name
unique_labels <- make.unique(as.character(pitching_df$playerFullName))
rownames(pitching_df.num) <- unique_labels
fviz_cluster(fit, pitching_df.num, geom = "text", labelsize=8, main = "Clusters with K-means")
# More readable plot
fviz_cluster(fit, data = pitching_df.num, geom = "point", label = "none", main = "Clusters with K-means") + geom_text(label = unique_labels, check_overlap = T, size = 3)
Plot of the contribution of each variable to each cluster
centers=fit$centers
tidy = cbind(
gather(as.data.frame(t(centers)), "cluster", "coor"),
var=rep(colnames(centers, k)),
size=rep(table(fit$cluster), each=ncol(centers))
)
tidy %>%
ggplot(aes(x=cluster, y=coor, fill=cluster)) +
geom_col() +
facet_wrap(~var) +
geom_text(aes(label=size),position=position_stack(1.2))
Different plot also showing how each variable contributes to the clusters
centers=fit$centers
tidy = cbind(
gather(as.data.frame(t(centers)), "cluster", "coor"),
var=rep(colnames(centers, k)),
size=rep(table(fit$cluster), each=ncol(centers))
)
tidy %>%
ggplot(aes(x=var, y=coor, fill=var)) +
geom_col() +
coord_flip() +
facet_wrap(~cluster) +
theme(axis.text.x = element_text(angle=45),legend.position = "none")
Showing clusters individually
# Use the colors as in fviz_cluster
colors <- c("#fc7671", "#00bb46", "#5d9bfa")
groups = fit$cluster
lapply(1:3, function(cluster_id) {
# Subset data
subset_data <- pitching_df.num[groups == cluster_id, ]
# Re-run clustering and plot
fit_subset <- kmeans(scale(subset_data), centers = 1, nstart = 50)
p2 <- fviz_cluster(fit_subset,
data = subset_data,
geom = c("point", "text"),
labelsize = 6,
repel = F,
ellipse.type = "convex",
ggtheme = theme_minimal()) +
scale_color_manual(values = colors[cluster_id]) +
theme(legend.position = "none") +
ggtitle(paste("Cluster", cluster_id))
})
## [[1]]
##
## [[2]]
##
## [[3]]
A silhouette plot to show how well the clusters are separated Eclust scalss the data automatically for us
fit.kmeans = eclust(pitching_df.num, "kmeans", k=3, stand=TRUE, nstart=100, graph = FALSE)
fviz_silhouette(fit.kmeans)
## cluster size ave.sil.width
## 1 1 137 0.36
## 2 2 207 0.19
## 3 3 304 0.13
WSS method Silhouette method Gap statistic method
# WSS
fviz_nbclust(scale(pitching_df.num), kmeans, method = "wss", k.max = 20, nstart = 200, iter.max = 100, nboot = 100)
# Silhouette
fviz_nbclust(scale(pitching_df.num), kmeans, method = "silhouette", k.max = 20, nstart = 1000, iter.max = 100)
# Gap statistic
fviz_nbclust(scale(pitching_df.num), kmeans, method = 'gap_stat', k.max = 7, nstart = 20, nboot = 100, iter.max = 500)
Does the # of clusters correlate to without pca?
No the optimal still says 2
cluster_pca <- prcomp(scale(pitching_df.num), scale = T)
my_pca_pitching_df <- data.frame(cluster_pca$x[, 1:2])
fviz_nbclust(my_pca_pitching_df, kmeans, method = 'silhouette', k.max = 10)
my_kmeans <- kmeans(my_pca_pitching_df, centers = 3)
rownames(my_pca_pitching_df) <- unique_labels
fviz_cluster(my_kmeans, data = my_pca_pitching_df, geom = "text", labelsize=8)
centers=my_kmeans$centers
tidy = cbind(
gather(as.data.frame(t(centers)), "cluster", "coor"),
var=rep(colnames(centers, k)),
size=rep(table(my_kmeans$cluster), each=ncol(centers))
)
tidy %>%
ggplot(aes(x=cluster, y=coor, fill=cluster)) +
geom_col() +
facet_wrap(~var) +
geom_text(aes(label=size),position=position_stack(1.2))
centers=my_kmeans$centers
tidy = cbind(
gather(as.data.frame(t(centers)), "cluster", "coor"),
var=rep(colnames(centers, k)),
size=rep(table(my_kmeans$cluster), each=ncol(centers))
)
tidy %>%
ggplot(aes(x=var, y=coor, fill=var)) +
geom_col() +
coord_flip() +
facet_wrap(~cluster) +
theme(axis.text.x = element_text(angle=45),legend.position = "none")
fit.pam = eclust(pitching_df.num, FUNcluster="pam", stand = TRUE, k=3,
graph = T, nstart=1000)
fviz_cluster(fit.pam, geom="point", main="Clusters with PAM", ellipse.type = "norm")+
geom_text(label=pitching_df$playerFullName, check_overlap = T)
# WSS
fviz_nbclust(scale(pitching_df.num), pam, method="wss", k.max=10)
# Silhouette
fviz_nbclust(scale(pitching_df.num), pam, method="silhouette", k.max=10)
# Gap statistic
fviz_nbclust(scale(pitching_df.num), pam, method="gap_stat", k.max=10)
adjustedRandIndex(fit.kmeans$cluster, fit.pam$clustering)
## [1] 0.7661753
create_kde_plot <- function(pitching_df, col_name) {
p <- ggplot(pitching_df, aes_string(x = col_name)) +
geom_density(fill = "darkblue", color = "black", alpha = 0.7) +
labs(title = paste("Kernel Density Estimation of", col_name),
x = col_name, y = "Density") +
theme_minimal()
print(p)
}
for (col_name in names(pitching_df.num)) {
if (is.numeric(pitching_df.num[[col_name]])) {
create_kde_plot(pitching_df.num, col_name)
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.